library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(sva) ; library(WGCNA) ; library(vsn) ; library(Biobase) ; library(GEOquery) ; library(limma)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)
Dataset downloded from two different places:
# LOAD DATA
# Expression data downliaded from GEMMA (https://gemma.msl.ubc.ca/expressionExperiment/showExpressionExperiment.html?id=11805)
datExpr = read.delim('./../Data/11805_GSE102741_expmat.data.txt.gz', comment.char='#')
datGenes = datExpr %>% dplyr::select(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid) %>%
dplyr::rename('entrezgene' = NCBIid)
datExpr = datExpr %>% dplyr::select(-c(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid))
colnames(datExpr) = sapply(colnames(datExpr), function(x) strsplit(x, '\\.')[[1]][3]) %>% unname
rownames(datExpr) = datGenes$entrezgene
# Metadata downloaded from GEO
GEO_data = getGEO('GSE102741', destdir='./../Data/')[[1]]
datMeta = GEO_data@phenoData@data %>%
mutate(Diagnosis = factor(ifelse(grepl('control', characteristics_ch1), 'CTL', 'ASD'),
levels = c('CTL','ASD')),
Age = substring(characteristics_ch1.4, 6) %>% as.numeric %>% round,
Sex = `Sex:ch1`,
Sample_ID = description,
Ethnicity = substring(characteristics_ch1.6, 7),
title = gsub(' ', '', title)) %>%
dplyr::select(title, geo_accession, Sample_ID, Diagnosis, Age, Sex, Ethnicity)
datMeta = datMeta[match(colnames(datExpr), datMeta$title),]
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n+1)
pal = hcl(h = hues, l = 65, c = 100)[1:n]
}
rm(GO_annotations)
Dataset consists of 52 samples (13 ASD and 39 Controls), all extracted from the DLPFC of the brain
Sequenced using Illumina’s HiSeq 2000 (Gupta used the same, Gandal used Illumina HiSeq 2500, they are compatible).
The dataset includes 25981 genes from 52 samples.
Counts distribution: The data has already been preprocessed, so we have relatively balanced values, centered close to 0
counts = datExpr %>% melt
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
max(counts$value)))
count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
| Statistic | Values |
|---|---|
| Min | -5.77 |
| 1st Quartile | -1.97 |
| Median | 1.06 |
| Mean | 0.89 |
| 3rd Quartile | 3.81 |
| Max | 16.47 |
rm(counts, count_distr)
Diagnosis distribution: There are three times more Control than ASD samples
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Sex = 'Gender')
cro(table_info$Diagnosis)
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 39 |
| ASD | 13 |
| #Total cases | 52 |
Gender distribution: There are thrice as many Male samples than Female ones
cro(table_info$Sex)
| #Total | |
|---|---|
| Gender | |
| Female | 12 |
| Male | 40 |
| #Total cases | 52 |
Even though the gender is imbalanced, they are not biased by Diagnosis
cro(table_info$Diagnosis, list(table_info$Sex, total()))
| Gender | #Total | |||
|---|---|---|---|---|
| Female | Male | |||
| Diagnosis | ||||
| CTL | 9 | 30 | 39 | |
| ASD | 3 | 10 | 13 | |
| #Total cases | 12 | 40 | 52 | |
Age distribution: Subjects between 2 and 69 years old with a mean of 22 years old
summary(datMeta$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 9.75 17.00 22.38 27.25 69.00
datMeta %>% ggplot(aes(Age)) +
geom_density(alpha=0.2, aes(group = Diagnosis, fill = Diagnosis, color = Diagnosis)) +
geom_density(alpha=0.3, fill='gray', color='gray') +
theme_minimal()
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)
Annotate genes with Biotype labels:
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4))
########################################################################################
# 1. Query archive version
# Note: NCBI ID = entrez ID
getinfo = c('entrezgene','ensembl_gene_id','hgnc_symbol','chromosome_name','start_position','end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_BM = getBM(attributes = getinfo, filters = c('entrezgene'), values = rownames(datExpr),
mart = mart)
datGenes = datGenes %>% mutate(entrezgene = entrezgene %>% as.character %>% as.integer) %>%
left_join(datGenes_BM, by = 'entrezgene')
datGenes$length = datGenes$end_position - datGenes$start_position
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 7504/29340 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 13338/29340 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart,
values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/',
sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9667/13338 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
values = missing_genes)
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 148/1279 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 3 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'),
values = missing_ensembl_ids, mart=mart)
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1/367 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% add_row(source = 'missing',
n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>%
mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
'NCBI','missing')))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$entrezgene),]
rm(getinfo, mart, datGenes_BM, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length
Considering all genes, this dataset contains 786 of the 912 SFARI genes
1.- Filter entries for which we didn’t manage to obtain its genotype information
1.1 Missing Biotype
to_keep = !is.na(datGenes$gene_biotype)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$entrezgene
Removed 7504 ‘genes’, 18477 remaining
Filtering genes without biotype information, we are left with 786 SFARI Genes (we lose 0 genes)
1.2 Missing Length of the sequence
to_keep = !is.na(datGenes$length)
datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]
Removed 0 ‘genes’, 18477 remaining
Filtering genes without sequence length information, we are left with 786 SFARI Genes (we lose 0 genes)
2.- Filter genes that do not encode any protein
85% of the genes are protein coding genes
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
kable_styling(full_width = F)
| . | Freq |
|---|---|
| protein_coding | 15790 |
| lncRNA | 1310 |
| 1 | 703 |
| 3 | 208 |
| transcribed_unprocessed_pseudogene | 133 |
| processed_pseudogene | 84 |
| 6 | 46 |
| transcribed_processed_pseudogene | 38 |
| lincRNA | 31 |
| transcribed_unitary_pseudogene | 28 |
| unprocessed_pseudogene | 26 |
| pseudogene | 20 |
| antisense | 18 |
| 7 | 12 |
| processed_transcript | 9 |
| snoRNA | 6 |
| misc_RNA | 4 |
| sense_intronic | 2 |
| 3prime_overlapping_ncrna | 1 |
| miRNA | 1 |
| polymorphic_pseudogene | 1 |
| rRNA | 1 |
| scaRNA | 1 |
| sense_overlapping | 1 |
| snRNA | 1 |
| TEC | 1 |
| TR_C_gene | 1 |
Most of the non-protein coding genes have very low levels of expression
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(MeanExpr, fill=ProteinCoding, color=ProteinCoding)) +
geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
Filtering protein coding genes, we are left with 783 SFARI Genes (we lose 3 genes)
Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>%
dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost Genes') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000187951 | ARHGAP11B | 3 | lncRNA | 0 | 2 |
| ENSG00000187951 | ARHGAP11B | 3 | lncRNA | 0 | 2 |
| ENSG00000224639 | C4B | 3 | lncRNA | 0 | 6 |
| ENSG00000233067 | PTCHD1-AS | 2 | lncRNA | 0 | 3 |
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
to_keep = datGenes$gene_biotype == 'protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$entrezgene
rownames(datGenes) = datGenes$entrezgene
Removed 2687 genes. 15790 remaining
3.- Filter genes with low expression levels
This seems to have already been done in the original preprocessing of the data, so I won’t do it again
4.- Filter outlier samples
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples
Only 3 outliers, all belonging to the control group and relatively young (under 6 yo)
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID,
'Sex'=datMeta$Sex, 'Age'=datMeta$Age, 'Diagnosis'=datMeta$Diagnosis)
selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])
Outlier samples: sample25, sample3, sample31
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
rm(absadj, netsummary, ku, z.ku, plot_data)
Removed 3 samples, 49 remaining
rm(to_keep)
5.- Filter repeated genes
There are 398 genes with more than one ensembl ID in the dataset. To accurately refer to the rows of my data as ‘genes’, I’m going to remove the repeated ones.
dup_genes = datGenes$hgnc_symbol %>% duplicated
datGenes = datGenes[!dup_genes,]
datExpr = datExpr[!dup_genes,]
Removed 398 genes. 15392 remaining
782 SFARI genes remaining (we lost 783 genes)
rm(dup_genes, n_SFARI)
save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/filtered_raw_data.RData')
I’m going to see how much heteroscedasticity is in this dataset
Using the plotting function DESEq2’s manual proposes to study homoscedasticity, it seems like the genes with the lowest level of expression have higher SD than the rest
meanSdPlot(datExpr %>% as.matrix, plot=FALSE)$gg + theme_minimal()
Plotting points individually we can notice even more heteroscedasticity in the data
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
scale_y_log10() + theme_minimal()
rm(plot_data)
It could be useful to work a bit more to obtain the raw counts of this dataset and perform the normalisation myself to see if I can reduce the heterocedasticity we found, but for now, this will have to do
Since our data has already been preprocessed, we cannot use DESeq2 to perform the differential expression analysis. instead I’m going to use limma following the procedure in RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR
fit = lmFit(datExpr, design=model.matrix(~Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_info = topTable(efit, sort.by = 'none', n = Inf)
## Removing intercept from test coefficients
rm(fit)
This dataset was processed in a single batch, so we don’t have any batch-related variables such as processing date or processing lab
Samples don’t seem to cluster together that strongly by any variable
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram
create_viridis_dict = function(){
min_age = datMeta$Age %>% min
max_age = datMeta$Age %>% max
viridis_age_cols = viridis(max_age - min_age + 1)
names(viridis_age_cols) = seq(min_age, max_age)
return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
dend_meta = datMeta[match(labels(h_clusts), datMeta$title),] %>%
mutate('Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='Female','#ff6666','#008ae6'), # Pink Female, Blue Male
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Sex, Diagnosis)
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>%
dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)
rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)
mod = model.matrix(~Diagnosis, data = datMeta %>% dplyr::select(Diagnosis, Age, Sex, Ethnicity))
mod0 = model.matrix(~1, data = datMeta %>% dplyr::select(Diagnosis, Age, Sex, Ethnicity))
sva_fit = svaseq(exp(datExpr) %>% as.matrix, mod = mod, mod0 = mod0)
## Number of significant surrogate variables is: 8
## Iteration (out of 5 ):1 2 3 4 5
rm(mod, mod0)
Found 8 surrogate variables
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))
datMeta = cbind(datMeta, sv_data)
rm(sv_data, sva_fit)
By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data
In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.
Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:
Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)
But caution should be exercised to avoid removing biological signal of interest
We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective
Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed
# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
X = cbind(mod, svs)
Hat = solve(t(X) %*% X) %*% t(X)
beta = (Hat %*% t(datExpr))
rm(Hat)
gc()
P = ncol(mod)
return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp
# Correct
mod = model.matrix(~ Diagnosis, data = datMeta)
svs = datMeta %>% dplyr::select(contains('SV')) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp
rm(correctDatExpr)
Removing batch effects has a big impact in the distribution of the samples, separating them by diagnosis relatively well
This time, both PC1 and PC2 seem to play a role in separating the samples by Diagnosis instead of just the 1st PC as we had seen in Gandal
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
'PC2'=pca_samples_before$x[,2], 'corrected'=0),
data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
left_join(datMeta %>% mutate('ID'=datMeta$title), by='ID')
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) +
xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
round(100*summary(pca_samples_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
round(100*summary(pca_samples_after)$importance[2,2],1))) +
ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)
It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)
Genes with lower levels of expression have much higher variances in the 2nd principal components than the rest of the genes
*Plot is done with only 10% of the genes so it’s not that heavy
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))
pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) +
geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
round(100*summary(pca_genes_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
round(100*summary(pca_genes_after)$importance[2,2],1))) +
scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)
Everything looks good, so we’re keeping the corrected expression dataset
datExpr = datExpr_corrected
rm(datExpr_corrected)
save(datExpr, datMeta, datGenes, DE_info, efit, file='./../Data/preprocessed_data.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.28 expss_0.10.2
## [4] dendextend_1.13.4 limma_3.40.6 GEOquery_2.52.0
## [7] vsn_3.52.0 Biobase_2.44.0 BiocGenerics_0.30.0
## [10] WGCNA_1.69 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [13] sva_3.32.1 BiocParallel_1.18.1 genefilter_1.66.0
## [16] mgcv_1.8-31 nlme_3.1-147 biomaRt_2.40.5
## [19] ggpubr_0.2.5 magrittr_1.5 ggExtra_0.9
## [22] GGally_1.5.0 gridExtra_2.3 viridis_0.5.1
## [25] viridisLite_0.3.0 RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [28] plotly_4.9.2 glue_1.4.1 reshape2_1.4.4
## [31] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0
## [34] purrr_0.3.4 readr_1.3.1 tidyr_1.1.0
## [37] tibble_3.0.1 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] crosstalk_1.1.0.1 digest_0.6.25 foreach_1.5.0
## [10] htmltools_0.4.0 GO.db_3.8.2 fansi_0.4.1
## [13] checkmate_2.0.0 memoise_1.1.0 cluster_2.1.0
## [16] doParallel_1.0.15 annotate_1.62.0 modelr_0.1.6
## [19] matrixStats_0.56.0 prettyunits_1.1.1 jpeg_0.1-8.1
## [22] colorspace_1.4-1 blob_1.2.1 rvest_0.3.5
## [25] haven_2.2.0 xfun_0.12 hexbin_1.28.1
## [28] crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.7.0
## [31] impute_1.58.0 survival_3.1-12 iterators_1.0.12
## [34] gtable_0.3.0 zlibbioc_1.30.0 webshot_0.5.2
## [37] scales_1.1.1 DBI_1.1.0 miniUI_0.1.1.1
## [40] Rcpp_1.0.4.6 xtable_1.8-4 progress_1.2.2
## [43] htmlTable_1.13.3 foreign_0.8-76 bit_1.1-15.2
## [46] preprocessCore_1.46.0 Formula_1.2-3 stats4_3.6.3
## [49] htmlwidgets_1.5.1 httr_1.4.1 acepack_1.4.1
## [52] ellipsis_0.3.1 farver_2.0.3 pkgconfig_2.0.3
## [55] reshape_0.8.8 XML_3.99-0.3 nnet_7.3-14
## [58] dbplyr_1.4.2 labeling_0.3 tidyselect_1.1.0
## [61] rlang_0.4.6 later_1.0.0 AnnotationDbi_1.46.1
## [64] munsell_0.5.0 cellranger_1.1.0 tools_3.6.3
## [67] cli_2.0.2 generics_0.0.2 RSQLite_2.2.0
## [70] broom_0.5.5 evaluate_0.14 fastmap_1.0.1
## [73] yaml_2.2.1 bit64_0.9-7 fs_1.4.0
## [76] mime_0.9 xml2_1.2.5 compiler_3.6.3
## [79] rstudioapi_0.11 curl_4.3 png_0.1-7
## [82] affyio_1.54.0 ggsignif_0.6.0 reprex_0.3.0
## [85] stringi_1.4.6 highr_0.8 lattice_0.20-41
## [88] Matrix_1.2-18 vctrs_0.3.1 pillar_1.4.4
## [91] lifecycle_0.2.0 BiocManager_1.30.10 cowplot_1.0.0
## [94] data.table_1.12.8 bitops_1.0-6 httpuv_1.5.2
## [97] affy_1.62.0 R6_2.4.1 latticeExtra_0.6-29
## [100] promises_1.1.0 IRanges_2.18.3 codetools_0.2-16
## [103] assertthat_0.2.1 withr_2.2.0 S4Vectors_0.22.1
## [106] hms_0.5.3 grid_3.6.3 rpart_4.1-15
## [109] rmarkdown_2.1 shiny_1.4.0.2 lubridate_1.7.4
## [112] base64enc_0.1-3